import commands
import csv
import os
import shutil
import sys
from itertools import izip

import numpy as np
import scipy.io
import scipy.sparse as sparse

# sys.path.append('../deob/')
# import utils


class OUTLOG:
	'''
	Minimal append-only logger used by the rough-selection drivers.

	Every call opens the log file in append mode and closes it again,
	so concurrent runs each add whole lines.
	'''
	def __init__(self, logfile = '../../data/ExtractionLogs/COUNTERLOG'):
		self.logfile = logfile

	def ADD(self, record):
		# One record per line.
		self.logf = open(self.logfile, 'a')
		self.logf.write('%s\n' % record)
		self.logf.close()

	def ADDCOUNTER(self, counters):
		# Serialize a counters dict as 'key,value;key,value;...' on one line.
		self.logf = open(self.logfile, 'a')
		parts = ['%s,%s;' % (key, value) for key, value in counters.items()]
		self.logf.write(''.join(parts) + '\n')
		self.logf.close()


# deprecated
def isBadFeature(row, meanBBar, meanMBar, varBBar, varMBar, allcounters):
	'''
	Deprecated.  Compare the benign half (first half of row) against the
	malicious half (second half) of one feature row.

	Returns False -- and bumps the matching counter in allcounters -- as
	soon as one of the mean/std gaps exceeds its bar; True otherwise.
	NOTE(review): roughSelection() skips the row when this returns True,
	so True effectively means "filtered out" -- confirm the intent.
	'''
	half = len(row) // 2
	benign = row[:half]
	malicious = row[half:]
	meanB = np.mean(benign)
	meanM = np.mean(malicious)
	# Mean checks first, in the original order.
	for name, gap, bar in (('meanBBar', meanB - meanM, meanBBar),
	                       ('meanMBar', meanM - meanB, meanMBar)):
		if gap > bar:
			allcounters[name] += 1
			return False
	# Despite the 'var' names these are standard deviations (np.std).
	stdB = np.std(benign)
	stdM = np.std(malicious)
	for name, gap, bar in (('varBBar', stdB - stdM, varBBar),
	                       ('varMBar', stdM - stdB, varMBar)):
		if gap > bar:
			allcounters[name] += 1
			return False
	return True
	

# deprecated
def roughSelection(infile, featurefile, outfile, outfeaturefile, meanBBar, meanMBar, varBBar, varMBar):
	'''
	Deprecated.  Filter feature rows by mean/std gap thresholds.

	Reads feature value rows from infile (CSV) and the matching feature
	names, one per line, from featurefile.  Rows for which
	isBadFeature() returns True are dropped; the rest are written to
	outfile with their names appended to outfeaturefile.

	Returns the dict of per-threshold rejection counters.
	NOTE(review): the extra featuref.readline() after the loop copies
	one trailing line (the label line?) -- preserved as-is; confirm.
	'''
	with open(infile, 'r') as inf:
		reader = csv.reader(inf)

		with open(featurefile, 'r') as featuref:
			outf = open(outfile, 'w')
			writer = csv.writer(outf)
			outfeaturef = open(outfeaturefile, 'w')

			allcounters = dict()
			allcounters['meanMBar'] = 0
			allcounters['meanBBar'] = 0
			allcounters['varMBar'] = 0
			allcounters['varBBar'] = 0
			for row in reader:
				# Feature names are consumed in lockstep with the rows.
				feature = featuref.readline()
				# np.float was removed from modern NumPy; float is equivalent.
				floatrow = np.array(row).astype(float)
				if (isBadFeature(floatrow, meanBBar, meanMBar, varBBar, varMBar, allcounters)):
					continue
				writer.writerow(floatrow)
				outfeaturef.write(feature)
			outfeaturef.write(featuref.readline())
			outfeaturef.close()
			# Fixed: outf was never closed, risking unflushed output.
			outf.close()

			print(allcounters)
	return allcounters


# deprecated
def callRoughSelection(MB, MM, VB, VM):
	'''
	Deprecated.  Run one rough-selection pass on the raw normalized
	features.  Arguments are strings; each bar is the value divided
	by 10.  Results and counters are appended to the OUTLOG.
	'''
	meanBBar, meanMBar, varBBar, varMBar = [float(v) / 10 for v in (MB, MM, VB, VM)]
	print('{0} {1} {2} {3}'.format(meanBBar, meanMBar, varBBar, varMBar))
	tag = 'SF{0}{1}{2}{3}'.format(MB, MM, VB, VM)
	counters = roughSelection(
		'../../data/ExtractionLogs/NormalizedTFeature.csv',
		'../../data/ExtractionLogs/Flat-Level-Features-In-CSV-Order.log',
		'/space/webcrawl-malware/data/ExtractionLogs/' + tag + '.csv',
		'/space/webcrawl-malware/data/ExtractionLogs/' + tag + '.log',
		meanBBar, meanMBar, varBBar, varMBar)

	log = OUTLOG()
	log.ADD('Benign Mean:' + MB + ', Malicious Mean:' + MM + ', Benign Variance:' + VB + ', Malicious Variance:' + VM)
	log.ADDCOUNTER(counters)
	

# deprecated
def callRoughSelectionPrevious(MB, MM, VB, VM, previousMB):
	'''
	Deprecated.  Like callRoughSelection(), but chains from the output
	of the previous pass (tagged with previousMB) instead of the raw
	normalized features.
	'''
	meanBBar, meanMBar, varBBar, varMBar = [float(v) / 10 for v in (MB, MM, VB, VM)]
	print('{0} {1} {2} {3}'.format(meanBBar, meanMBar, varBBar, varMBar))
	intag = 'SF{0}{1}{2}{3}'.format(previousMB, MM, VB, VM)
	outtag = 'SF{0}{1}{2}{3}'.format(MB, MM, VB, VM)
	counters = roughSelection(
		'../../data/ExtractionLogs/' + intag + '.csv',
		'../../data/ExtractionLogs/' + intag + '.log',
		'/space/webcrawl-malware/data/ExtractionLogs/' + outtag + '.csv',
		'/space/webcrawl-malware/data/ExtractionLogs/' + outtag + '.log',
		meanBBar, meanMBar, varBBar, varMBar)

	log = OUTLOG()
	log.ADD('Benign Mean:' + MB + ', Malicious Mean:' + MM + ', Benign Variance:' + VB + ', Malicious Variance:' + VM)
	log.ADDCOUNTER(counters)
	


def cpFile(infile, outfile):
	'''
	Copy infile to outfile.

	Fixed: previously shelled out to ``cp`` through the deprecated
	``commands`` module with unquoted string concatenation, which broke
	on paths containing spaces or shell metacharacters and silently
	swallowed errors.  shutil.copy copies data and permission bits and
	raises on failure.
	'''
	shutil.copy(infile, outfile)


def binarizeFile(infile, outfile, threshold=0):
	'''
	Binarize a CSV file of numeric features.

	Each value strictly greater than `threshold` becomes 1, else 0.
	Blank rows are skipped.  Default threshold is 0.

	Fixed: the output handle leaked if an exception occurred mid-loop
	(now a with-block), and np.float (removed from modern NumPy) is
	replaced by the equivalent builtin float.
	'''
	print('Binarizing..................................')
	with open(infile, 'r') as inf, open(outfile, 'w') as outf:
		reader = csv.reader(inf)
		writer = csv.writer(outf)
		for row in reader:
			if len(row) == 0:
				continue
			floatrow = np.array(row).astype(float)
			binarizedrow = [1 if ele > threshold else 0 for ele in floatrow]
			writer.writerow(binarizedrow)


def normalizeFile(infile, outfile):
	'''
	Row-normalize a CSV file: each value is divided by its row's max,
	so every output row has max 1.

	Fixed: the print line was space-indented in a tab-indented file
	(an error under python3 / python2 -tt); the output handle leaked
	(now a with-block); all-zero rows previously divided by zero and
	produced nan (now left unchanged, matching
	SparseMatrix.normalizeRow); blank rows previously crashed on
	max([]); np.float (removed from modern NumPy) replaced by float.
	'''
	print('Normalizing..................................')
	with open(infile, 'r') as inf, open(outfile, 'w') as outf:
		reader = csv.reader(inf)
		writer = csv.writer(outf)
		for row in reader:
			if len(row) == 0:
				continue
			floatrow = np.array(row).astype(float)
			top = max(floatrow)
			if top != 0:
				floatrow = floatrow / top
			writer.writerow(floatrow)
			

def transposeLargeFile(infile, outfile, complete=True):
	'''
	Transpose a CSV file too large to load into memory.

	Works one input row at a time: on each row the output file is
	rewritten from scratch by appending that row's values as a new
	column to every line of the previous pass (kept in a temp file).
	Constant memory, but O(rows) full rewrites of the output.

	complete -- when False, later rows may be LONGER than earlier ones;
	the extra values get new output lines padded with one '0,' per
	already-processed row.

	NOTE(review): if the input has no non-empty rows, the temp file is
	never created and os.remove() at the end raises OSError -- confirm
	callers never pass an empty file.
	'''
	print 'Transposing..................................'
	with open(infile, 'r') as inf:
		reader = csv.reader(inf)

		rowcount = 0
		tempoutfile = outfile + '.temp'
		for row in reader:
			if len(row) == 0:
				continue
			# if the last separator is ',', then skip it.
			if row[-1] == '':
				row = row[:-1]
			rowcount = rowcount + 1
			# Rewrite the whole output file for this row ('w' truncates).
			outf = open(outfile, 'w')
			if rowcount == 1:
				# First row seeds the output: one line per value.
				for r in row:
					line = r + '\n'
					outf.write(line)
			else:
				# Append each value of this row as a new trailing column
				# to the lines produced by the previous pass.
				tempinf = open(tempoutfile, 'r')
				count = 0
				row_length = len(row)
				for line in tempinf:
					newline = line[:-1] + ',' + row[count] + '\n'
					outf.write(newline)
					count = count + 1
				if not complete:
					# This row was longer than the previous ones: emit
					# new lines, zero-padded for the rows already seen.
					prefix = ''
					for i in range(1, rowcount):
						prefix = prefix + '0,'
					for index in range(count, row_length):
						newline = prefix + row[index] + '\n'
						outf.write(newline)
				tempinf.close()
			outf.close()
			# Snapshot the output as the input of the next row's pass.
			cpFile(outfile, tempoutfile)
		os.remove(tempoutfile)
		
	'''
	matrix = np.loadtxt(infile, delimiter=',')
	print matrix.shape
	'''

	
def transpose(matrix):
	'''Return the transpose of matrix (thin wrapper over np.transpose).'''
	return np.transpose(matrix)


def transposeSmallFile(infile, outfile, movelabel=False):
	'''
	Transpose a small integer CSV file (loaded fully into memory).

	movelabel -- when True, the first row (labels) is moved to the last
	row before transposing, so labels end up in the last column.

	Fixed: the output handle leaked without a with-block, and the
	unidiomatic 'movelabel == True' test is now a plain truth test.
	NOTE(review): assumes at least two rows when movelabel is set; a
	single-row file yields a 1-D array and the row shuffle would fail.
	'''
	data = np.genfromtxt(infile, dtype=int, delimiter=',')
	if movelabel:
		# copy, otherwise the shift below would overwrite it.
		labelrow = data[0, :].copy()
		data[:-1, :] = data[1:, :]
		data[-1, :] = labelrow
	data = np.transpose(data)
	with open(outfile, 'w') as outf:
		writer = csv.writer(outf, lineterminator='\n')
		for row in data:
			writer.writerow(row)


class SparseMatrix:
	'''
	Integer matrix loaded from a CSV of counts into a scipy LIL sparse
	matrix, plus small row-level transformation helpers.

	fmt='v' transposes the matrix after loading.  Row 0 of the resulting
	matrix is then treated as class labels and their per-class totals
	are stored in self.classTotals.
	NOTE(review): the label-row assumption is inferred from the
	classrow code below -- confirm against the data files.

	Fixes: np.int (removed in NumPy >= 1.24) replaced with int;
	'infile == None' replaced with 'is None'; blank CSV lines no longer
	crash the loaders with IndexError on row[-1].
	'''
	def __init__(self, infile=None, rows=0, columns=0, fmt='v'):
		# A bare SparseMatrix() is allowed so helper methods can be used
		# without loading a matrix (see logDimension()).
		if infile is None:
			return
		if rows == 0 or columns == 0:
			# Dimensions not supplied: scan the file once to size the matrix.
			self.rows, self.columns = self.getRowCol(infile)
		else:
			self.rows = rows
			self.columns = columns
		print('Row count {0}, Column count {1}'.format(self.rows, self.columns))
		self.csvreader = csv.reader(open(infile, 'r'))
		self.matrix = sparse.lil_matrix( (self.rows, self.columns) , dtype='int')
		rownum = 0
		for row in self.csvreader:
			if not row:
				continue  # skip blank lines (previously crashed on row[-1])
			# Drop the empty field produced by a trailing comma.
			row = row[:-1] if row[-1] == '' else row
			introw = np.array(row).astype(int)
			self.matrix[rownum, :len(introw)] = introw
			rownum = rownum + 1
			print('Progress {0}'.format(rownum))
		print('Matrix load successful!')
		if fmt == 'v':
			self.transpose()
		classrow = self.formatRow(self.matrix[0,:])
		self.classTotals = self.histogramRow(classrow, max(classrow)+1)

	def getRowCol(self, infile):
		'''Scan a CSV file; return (row count, widest column count).'''
		rowcount = 0
		colcount = 0
		csvreader = csv.reader(open(infile, 'r'))
		for line in csvreader:
			if not line:
				continue  # skip blank lines (previously crashed on line[-1])
			rowcount = rowcount + 1
			# A trailing comma yields a final empty field; don't count it.
			columns = len(line) - 1 if line[-1] == '' else len(line)
			colcount = columns if colcount < columns else colcount
		return rowcount, colcount

	def transpose(self):
		'''Transpose the loaded matrix in place.'''
		self.matrix = self.matrix.transpose()
		print('Matrix transposed!')

	def normalizeRow(self, row):
		'''Scale row so its max is 1; all-zero rows returned unchanged.'''
		top = max(row)
		if top == 0:
			return row
		else:
			return row/top

	def binarizeRow(self, row, threshold):
		'''Return a 0/1 list: 1 where the value exceeds threshold.'''
		return [1 if ele > threshold else 0 for ele in row]

	def logCeilRow(self, row):
		'''Bucket counts by magnitude: ceil(log10(x + 1)) as int.'''
		return np.ceil(np.log10(row+1)).astype(int)

	def histogramRow(self, row, bincount=10):
		'''
		Histogram of row over integer bins.

		bincount is the number of bins: bincount=10 means bins
		[0,1), [1,2), ... [9,10].
		'''
		print(bincount)
		return np.histogram(row, bins=range(bincount+1))[0]

	def formatRow(self, row):
		'''Densify a 1xN sparse row into a flat 1-D numpy array.'''
		return np.squeeze(np.asarray(row.todense()))


#  deprecated
def iteration():
	'''
	Deprecated.  Grid-search driver over the four threshold bars.

	For each (MM, VB, VM) combination, sweeps MB and chains each pass
	on the previous one's output via callRoughSelectionPrevious().
	'''
	for MM in np.arange(0.5, 2.5, 0.5):
		for VB in np.arange(2, 5, 1):
			for VM in np.arange(2, 5, 1):
				previousMB = 0
				for MB in np.arange(1, 4, 0.5):
					args = (str(MB), str(MM), str(VB), str(VM))
					print('{0} {1} {2} {3}'.format(*args))
					if MB == 1:
						# First pass starts from the raw features.
						callRoughSelection(*args)
					else:
						callRoughSelectionPrevious(args[0], args[1], args[2], args[3], previousMB = str(previousMB))
					previousMB = MB


# deprecated
def processFile(indir='/space/outputlogs112/', threshold=0, complete=True):
	'''
	Deprecated.  Full preprocessing pipeline for one output directory:
	transpose the raw counts, row-normalize, then binarize at threshold.

	Fixed: the normalizeFile call was space-indented while the rest of
	the file uses tabs (an error under python3 / python2 -tt).
	'''
	transposeLargeFile('{0}Flat-Level-Count.csv'.format(indir), '{0}TransposeFeature.csv'.format(indir), complete)
	normalizeFile('{0}TransposeFeature.csv'.format(indir), '{0}NormalizedTFeature.csv'.format(indir))
	binarizeFile('{0}NormalizedTFeature.csv'.format(indir), '{0}BinarizeFeature{1}.csv'.format(indir, threshold), threshold)


# Deprecated.
def old_main():
	'''
	Deprecated entry point: select features with different threshold
	bars, taken from the command line as either four arguments, one
	comma-separated argument, or none (full grid search).

	Fixed: VM was read from 'yss.argv[4]' (a typo for sys.argv[4])
	which raised NameError on the four-argument path.
	'''
	argc = len(sys.argv)
	if argc == 5:
		MB = sys.argv[1]
		MM = sys.argv[2]
		VB = sys.argv[3]
		VM = sys.argv[4]
	elif argc == 2:
		MB, MM, VB, VM = sys.argv[1].split(',')
	elif argc == 1:
		iteration()
		exit(0)
	else:
		print("Error: Please specify benign mean bar, malicious mean bar, benign variance bar, malicious variance bar")
		print("Format: python modelUtils.py 'MB,MM,VB,VM' or MB MM VB VM")
		exit(1)
	callRoughSelection(MB, MM, VB, VM)


def logDimension(files, logfile = '../../data/TempData/LOGs/dimension.log'):
	'''
	Append one '<file>,<rowcount>,<colcount>' line per CSV in files,
	using SparseMatrix.getRowCol() to measure each file.
	'''
	spm = SparseMatrix()
	with open(logfile, 'a') as logf:
		for f in files:
			rowcount, colcount = spm.getRowCol(f)
			logf.write('{0},{1},{2}\n'.format(f, rowcount, colcount))


if __name__=="__main__":
	# Earlier one-off runs, kept for reference:
	# processFile('/space/outputlogsTerry1000/', 0, complete=False)
	# SparseMatrix('/data/urlcrawl/outputlogsTerry2000/Flat-Level-Count.csv')
	# SparseMatrix('/data/urlcrawl/outputlogs112/Flat-Level-Count.csv')
	# logDimension(['/data/urlcrawl/outputlogsTerry2000/Flat-Level-Count.csv', '/data/urlcrawl/outputlogs112/Flat-Level-Count.csv'])

	# Example parameters, overridable from the command line.
	basedir = '/space/outputlogsTerry200B800M'
	method = 'scaleHist'
	thresbin = 10
	pthreshold = 0.0001
	if len(sys.argv) == 5:
		# NOTE(review): argv values stay strings; they are only used
		# inside the path templates below, so no conversion is needed.
		[basedir, method, thresbin, pthreshold] = sys.argv[1:]

	infile = '{0}/features_{1}_{2}_{3}.csv'.format(basedir, method, thresbin, pthreshold)
	outfile = '{0}/ARFF_{1}_{2}_{3}.csv'.format(basedir, method, thresbin, pthreshold)
	transposeSmallFile(infile, outfile, True)


